#!/usr/bin/python3
# -*- coding: utf-8 -*-

from ui import wenkuUI, check_bdwenku_url
import requests
from bs4 import BeautifulSoup
import re
import json
import os 
import time
import chardet
from tools import image2PDF
from PyQt5.QtWidgets import QMessageBox




# Work around garbled Chinese text caused by a wrongly guessed response encoding
def solve_encoding(r):
    """Return the body of response *r* as text, decoded with the best-guess charset.

    ``requests`` reports ``ISO-8859-1`` when a text response does not declare
    a charset in its headers, which garbles Chinese pages.  In that case the
    charset declared inside the document is used, falling back to the charset
    detected from the raw bytes.  For every other encoding the text already
    decoded by ``requests`` is returned unchanged.

    Args:
        r: A ``requests.Response``-like object exposing ``encoding``,
            ``text``, ``content`` and ``apparent_encoding``.

    Returns:
        The decoded body as ``str`` (never ``None``).
    """
    if r.encoding != 'ISO-8859-1':
        # The charset was declared (or auto-detected) by requests; trust it.
        return r.text

    declared = requests.utils.get_encodings_from_content(r.text)
    encoding = declared[0] if declared else r.apparent_encoding
    try:
        # Detection may yield None (e.g. empty body); utf-8 is the safe default.
        return r.content.decode(encoding or 'utf-8', 'replace')
    except LookupError:
        # The page declared a codec name that Python does not know.
        return r.content.decode(r.apparent_encoding or 'utf-8', 'replace')

class BDWenKu(wenkuUI):
    """Baidu Wenku downloader built on top of the ``wenkuUI`` class.

    A document page is scraped for its type, title and id, and its pages are
    saved under ``<parent_path>/<title>/``:

    * ``doc`` - the text of every page is appended to ``<title>.doc``;
    * ``pdf`` / ``ppt`` - every page is downloaded as a PNG image and the
      folder is then handed to ``image2PDF``;
    * ``txt`` - not supported.

    Progress is reported through ``self.printlog`` (provided by ``wenkuUI``).
    """

    # Seconds to wait on a single HTTP request, so that a stalled connection
    # cannot block the caller forever.
    REQUEST_TIMEOUT = 30

    # Endpoint queried for the list of page images of a PPT document.
    PPT_PAGES_URL = ('https://wenku.baidu.com/browse/getbcsurl'
                     '?doc_id=%s&pn=1&rn=99999&type=ppt')

    def __init__(self, parent_path):
        """Set up the download state, build the UI and create the download root.

        Args:
            parent_path: Directory under which one sub-folder per downloaded
                document is created.
        """
        self.parent_path = parent_path
        self.childfolder_path = ''  # folder of the document being downloaded
        self.js_list = []           # per-page JSON URLs of a doc document
        self.pic_list = []          # per-page image URLs of a pdf document
        self.docType = ''           # document type reported by the page
        self.title = ''             # document title, safe to use as a file name
        # Misspelt legacy attribute; nothing in this class reads it, it is
        # only kept in case external code does.
        self.doc_tile = ''

        wenkuUI.__init__(self)  # build the UI

        # Make sure the download root exists.
        os.makedirs(self.parent_path, exist_ok=True)

    # Overrides wenkuUI.btnClik
    def btnClik(self):
        """Validate the URL typed by the user and start the download."""
        self.tbr.clear()
        url = self.urlinput.text()

        if check_bdwenku_url(url):
            self.printlog('URL 格式正确 :)')
            try:
                self.run(url)
            except requests.RequestException as exc:
                # A network failure is an expected condition for a
                # downloader; report it instead of letting it escape.
                self.printlog('网络请求失败: %s' % exc)
        elif url == '':
            QMessageBox.information(self, '错误', '请输入URL :)')
        else:
            QMessageBox.information(self, '错误', '此URL格式错误！\n请检查:)')

    def run(self, url):
        """Download the document at *url* and log where it was saved.

        Success and the save path are only reported when a supported document
        was actually processed.

        Args:
            url: Address of the Baidu Wenku document page.
        """
        self.printlog("开始解析文档URL :)")
        start = time.time()
        self.parseJS(url)

        if self.docType in ('pdf', 'ppt'):
            savename = self.title + '.pdf'
            # The page images are in the document's sub-folder.
            self.printlog('所有图片下载完成, 开始转换生成PDF...')
            image2PDF(self, self.childfolder_path, savename, self.docType)
        elif self.docType == 'txt':
            # parseJS has already reported that txt is unsupported.
            return
        elif self.docType != 'doc':
            self.printlog("错误...")
            return

        self.printlog('《' + self.title + '》下载完成！')
        # Report the folder that was really used instead of a fixed name.
        p = os.path.abspath(os.path.join(self.parent_path, self.title))
        self.printlog('保存路径为: ')
        self.printlog(p)
        end = time.time()
        self.printlog('总共耗时: ' + str(round(end - start)) + 's')

    def parseJS(self, url):
        """Fetch the document page, read its metadata and download its pages.

        Sets ``self.docType`` and ``self.title`` and dispatches to
        ``parseDoc``, ``parsePPT`` or ``parsePDF``.  When the page does not
        contain the expected information a message is logged and
        ``self.docType`` is left empty.

        Args:
            url: Address of the Baidu Wenku document page.
        """
        # Forget the type of a previous download so a failure is detectable.
        self.docType = ''
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)

        # Fix garbled Chinese text; fall back to the text decoded by requests
        # if the helper gives nothing back.
        encode_content = solve_encoding(r)
        if encode_content is None:
            encode_content = r.text

        soup = BeautifulSoup(encode_content, 'html.parser')
        raw_js = None   # script that holds the per-page URLs
        doc_js = None   # script that holds the document metadata
        for tag in soup.find_all('script', attrs={'type': 'text/javascript'}):
            if 'WkInfo.htmlUrls' in tag.text:
                raw_js = tag.text
            if 'WkInfo.Urls' in tag.text:
                doc_js = tag.text

        if doc_js is None:
            self.printlog('解析失败: 页面中未找到文档信息')
            return

        doc_info = doc_js.split('WkInfo.DocInfo=')[0]
        # Capture only the quoted value, so that a ':' inside the title or a
        # neighbouring key on the same line cannot leak into the result.
        doc_type = re.search(r"'docType': '(\w+)'", doc_info)
        title = re.search(r"'title': '(.*?)'", doc_info)
        doc_id = re.search(r"'docId': '(.*?)'", doc_info)
        if not (doc_type and title and doc_id):
            self.printlog('解析失败: 页面中未找到文档信息')
            return

        docId = doc_id.group(1).strip(' ')
        self.docType = doc_type.group(1)
        # The title names the sub-folder and the files, so it must be a valid
        # file name; an empty title falls back to the document id.
        self.title = self._safe_name(title.group(1).strip(' ')) or docId

        self.printlog('URL解析成功！')
        self.printlog('文档名称:《 %s 》' % self.title)
        self.printlog('文档类型:   %s' % self.docType)

        if self.docType in ('doc', 'pdf') and raw_js is None:
            self.printlog('解析失败: 页面中未找到分页地址')
            self.docType = ''
            return

        if self.docType == 'doc':
            self.parseDoc(raw_js, self.title)
        elif self.docType == 'ppt':
            self.parsePPT(docId, self.title)
        elif self.docType == 'pdf':
            self.parsePDF(raw_js, self.title)
        elif self.docType == 'txt':
            self.printlog('txt 暂不支持')

    def parseDoc(self, raw_js, file_name):
        """Download every page of a ``doc`` document as text.

        Args:
            raw_js: Text of the script containing ``WkInfo.htmlUrls``.
            file_name: Base name (without extension) of the output file.
        """
        # Start from a fresh list so pages of an earlier download are not
        # fetched again.
        self.js_list = self._extract_page_urls(raw_js, 'json')
        self.printlog('Doc共有 %d 页' % len(self.js_list))

        # saveDoc appends page after page; remove a file left by an earlier
        # download so the text is not duplicated.
        target = self._doc_path(file_name, 'doc')
        if os.path.exists(target):
            os.remove(target)

        for js_url in self.js_list:
            self.saveDoc(js_url, file_name, 'doc')

    def parsePPT(self, docID, file_name):
        """Download every page of a ``ppt`` document as a PNG image.

        Args:
            docID: Document id used to query the list of page images.
            file_name: Prefix of the image files (``<file_name><page>.png``).
        """
        r = requests.get(self.PPT_PAGES_URL % docID,
                         timeout=self.REQUEST_TIMEOUT)
        result = r.json()

        self.printlog('PPT共有 %d 页' % len(result))
        folder = self._ensure_child_folder()

        self.printlog('开始下载PPT图片...')
        for page in result:
            path = os.path.join(folder, '%s%d.png' % (file_name, page['page']))
            self._download(page['zoom'], path)
            self.printlog("第%d张图片下载完成！" % page['page'])

    def parsePDF(self, raw_js, file_name):
        """Download every page of a ``pdf`` document as a PNG image.

        Args:
            raw_js: Text of the script containing ``WkInfo.htmlUrls``.
            file_name: Prefix of the image files (``<file_name><index>.png``,
                index starting at 0).
        """
        self.pic_list = self._extract_page_urls(raw_js, 'png')
        # The first URL has always been discarded by this code; presumably it
        # is not a page image - TODO confirm.  Slicing keeps an empty list
        # from raising.
        del self.pic_list[:1]

        self.printlog('PDF 有%d页' % len(self.pic_list))
        folder = self._ensure_child_folder()

        self.printlog('开始下载PDF图片...')
        for index, pic_url in enumerate(self.pic_list):
            path = os.path.join(folder, '%s%d.png' % (file_name, index))
            self._download(pic_url, path)
            self.printlog("第%d张图片下载完成！" % (index + 1))

    def saveDoc(self, js_url, file_name, save_type):
        """Fetch one page of a document and append its text to the output file.

        Args:
            js_url: URL of the page data.
            file_name: Base name (without extension) of the output file.
            save_type: ``'doc'`` for a ``.doc`` file; anything else gives
                ``.txt``.
        """
        r = requests.get(js_url.replace('\\', ''), timeout=self.REQUEST_TIMEOUT)
        # The payload is wrapped in a call, ``name(...)``: keep what is inside
        # the parentheses and parse it as JSON.
        result = json.loads(r.text.split('(', 1)[1].strip(')'))

        path = self._doc_path(file_name, save_type)

        chunks = []
        for item in result['body']:
            if item['t'] != 'word':
                continue
            ps = item['ps']
            if ps is not None and '_enter' in ps:
                # An '_enter' item is written as a line break in place of its
                # own content (behaviour kept from the original code).
                chunks.append('\n')
            else:
                chunks.append(item['c'])

        text = ''.join(chunks)
        if text:
            # One write per page instead of re-opening the file for each word.
            with open(path, 'a', encoding='utf-8') as fd:
                fd.write(text)

    @staticmethod
    def _safe_name(name):
        """Return *name* with characters that are invalid in file names replaced."""
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip(' .')

    @staticmethod
    def _extract_page_urls(raw_js, marker):
        """Return the page URLs containing *marker* listed in ``WkInfo.htmlUrls``.

        An empty list is returned when the script does not have the expected
        layout.
        """
        parts = raw_js.split(' WkInfo.htmlUrls =')
        if len(parts) < 2:
            return []
        urls_js = parts[1].split(';')[0]
        # Drop the escaped quotes (\x22) and the remaining backslashes.
        urls_js = urls_js.replace('\\x22', '').replace('\\', '')
        found = re.findall(r'pageLoadUrl:.*\w', urls_js)
        if not found:
            return []
        return [entry.split(':', 1)[1].replace('}', '').strip(']')
                for entry in found[0].split(',')
                if marker in entry and ':' in entry]

    def _ensure_child_folder(self):
        """Create (if needed) and return the folder of the current document."""
        self.childfolder_path = os.path.join(self.parent_path, self.title)
        os.makedirs(self.childfolder_path, exist_ok=True)
        return self.childfolder_path

    def _doc_path(self, file_name, save_type):
        """Return the output path of a text document, creating its folder."""
        extension = '.doc' if save_type == 'doc' else '.txt'
        return os.path.join(self._ensure_child_folder(), file_name + extension)

    def _download(self, url, path):
        """Fetch *url* and write the response body to *path*."""
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        with open(path, 'wb') as fd:
            fd.write(r.content)





# Sample Baidu Wenku document URLs for manual testing.  Only `doc_url` is
# referenced, and only from the commented-out code in the `__main__` block.
# NOTE(review): the names suggest a doc document and a txt document; the type
# behind `url` cannot be told from here - confirm before relying on them.
doc_url = 'https://wenku.baidu.com/view/22357e96492fb4daa58da0116c175f0e7dd11974.html?from=search'
url = 'https://wenku.baidu.com/view/9f9acebc78563c1ec5da50e2524de518964bd386.html?from=search'
txt_url = 'https://wenku.baidu.com/view/4b3360530b4c2e3f57276397.html?from=search'



if __name__ == "__main__":
    # Download root intended for a run as a script.
    dir_path = './download/'
    # The download itself is currently disabled: the lines below are commented
    # out, so running this module only assigns `dir_path`.
    # wenku = BDWenKu(dir_path)
    # #url = input('Enter URL: ')
    # wenku.run(doc_url)